Load the necessary libraries
library(car) #for regression diagnostics
library(broom) #for tidy output
library(ggfortify) #for model diagnostics
library(sjPlot) #for outputs
library(knitr) #for kable
library(effects) #for partial effects plots
library(emmeans) #for estimating marginal means
library(ggeffects) #for plotting marginal means
library(MASS) #for glm.nb
library(MuMIn) #for AICc
library(tidyverse) #for data wrangling
library(modelr) #for auxillary modelling functions
library(performance) #for residuals diagnostics
library(see) #for plotting residuals
library(DHARMa) #for residual diagnostics plots
library(patchwork) #grid of plots
library(scales) #for more scales
theme_set(theme_classic())
Here is a modified example from Quinn and Keough (2002). Day and Quinn (1989) described an experiment that examined how rock surface type affected the recruitment of barnacles to a rocky shore. The experiment had a single factor, surface type, with 4 treatments or levels: algal species 1 (ALG1), algal species 2 (ALG2), naturally bare surfaces (NB) and artificially scraped bare surfaces (S). There were 5 replicate plots for each surface type and the response (dependent) variable was the number of newly recruited barnacles on each plot after 4 weeks.
Six-plated barnacle
Format of day.csv data files
| treat | barnacle |
|---|---|
| ALG1 | 27 |
| .. | .. |
| ALG2 | 24 |
| .. | .. |
| NB | 9 |
| .. | .. |
| S | 12 |
| .. | .. |
| treat | Categorical listing of surface types. ALG1 = algal species 1, ALG2 = algal species 2, NB = naturally bare surface, S = scraped bare surface. |
| barnacle | The number of newly recruited barnacles on each plot after 4 weeks. |
As we are going to treat Treatment as a categorical predictor, we will specifically declare it as such straight after importing the data.
day = read_csv('../data/day.csv', trim_ws=TRUE)
glimpse(day)
## Rows: 20
## Columns: 2
## $ TREAT <chr> "ALG1", "ALG1", "ALG1", "ALG1", "ALG1", "ALG2", "ALG2", "ALG…
## $ BARNACLE <dbl> 27, 19, 18, 23, 25, 24, 33, 27, 26, 32, 9, 13, 17, 14, 22, 1…
day <- day %>% janitor::clean_names() %>%
mutate(treat = fct_relevel(treat, c("NB", "S", "ALG1", "ALG2")))
Model formula: \[ y_i \sim{} \mathcal{Pois}(\lambda_i)\\ \mu_i = \boldsymbol{\beta} \bf{X_i} \]
where \(\boldsymbol{\beta}\) is a vector of effects parameters and \(\bf{X}\) is a model matrix representing the intercept and treatment contrasts for the effects of Treatment on barnacle recruitment.
ggplot(day, aes(y=barnacle, x=treat)) +
geom_boxplot()+
geom_point(color='red')
ggplot(day, aes(y=barnacle, x=treat)) +
geom_violin()+
geom_point(color='red')
day_mod <- glm(barnacle ~ treat, data = day, family = poisson(link = "log"))
autoplot(day_mod, which = 1:6)
DHARMa::simulateResiduals(day_mod, plot=T)
## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.8519155 0.2442125 0.1780693 0.5192783 0.7018186 0.2069486 0.845717 0.4237525 0.3734334 0.797562 0.05894113 0.3008211 0.7262854 0.3287856 0.9518321 0.4402874 0.06980402 0.6823056 0.9620096 0.2676171 ...
All looks good!
Cook’s d is not needed if you only have categorical predictors
plot_model(day_mod, type = 'eff')
## $treat
summary(day_mod)
##
## Call:
## glm(formula = barnacle ~ treat, family = poisson(link = "log"),
## data = day)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6748 -0.6522 -0.2630 0.5699 1.7380
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7081 0.1155 23.452 < 2e-16 ***
## treatS -0.1278 0.1688 -0.757 0.4488
## treatALG1 0.4010 0.1492 2.688 0.0072 **
## treatALG2 0.6383 0.1427 4.472 7.75e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 54.123 on 19 degrees of freedom
## Residual deviance: 17.214 on 16 degrees of freedom
## AIC: 120.34
##
## Number of Fisher Scoring iterations: 4
exp(2.7081)
## [1] 15.00075
exp(2.7081 + -0.1278)
## [1] 13.2011
exp(2.7081 + 0.4010)
## [1] 22.40087
exp(2.7081 + 0.6383)
## [1] 28.40031
exp(2.7081) = 15.0 is the number of barnacles in the NB treatment 13.2 is the number of barnacles in the S treatment (also exp(-0.1278) is the factor or fractional difference) 22.4 is the number of barnacles in the ALG1 treatment 28.4 is the number of barnacles in the ALG2 treatment
tidy(day_mod, confint = T)
tidy(day_mod, confint = T, exponentiate = T)
# fancy table:
# sjPlot::tab_model(day_mod, show.se = T, show.aic = T)
day_mod %>% emmeans(~treat, type = 'response') %>% plot(add.data=T)
emmeans(day_mod, pairwise~treat)
## $emmeans
## treat emmean SE df asymp.LCL asymp.UCL
## NB 2.71 0.1155 Inf 2.48 2.93
## S 2.58 0.1231 Inf 2.34 2.82
## ALG1 3.11 0.0945 Inf 2.92 3.29
## ALG2 3.35 0.0839 Inf 3.18 3.51
##
## Results are given on the log (not the response) scale.
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df z.ratio p.value
## NB - S 0.128 0.169 Inf 0.757 0.8735
## NB - ALG1 -0.401 0.149 Inf -2.688 0.0362
## NB - ALG2 -0.638 0.143 Inf -4.472 <.0001
## S - ALG1 -0.529 0.155 Inf -3.408 0.0037
## S - ALG2 -0.766 0.149 Inf -5.143 <.0001
## ALG1 - ALG2 -0.237 0.126 Inf -1.878 0.2375
##
## Results are given on the log (not the response) scale.
## P value adjustment: tukey method for comparing a family of 4 estimates
emmeans(day_mod, pairwise~treat, type = 'response')
## $emmeans
## treat rate SE df asymp.LCL asymp.UCL
## NB 15.0 1.73 Inf 12.0 18.8
## S 13.2 1.62 Inf 10.4 16.8
## ALG1 22.4 2.12 Inf 18.6 27.0
## ALG2 28.4 2.38 Inf 24.1 33.5
##
## Confidence level used: 0.95
## Intervals are back-transformed from the log scale
##
## $contrasts
## contrast ratio SE df z.ratio p.value
## NB / S 1.136 0.1918 Inf 0.757 0.8735
## NB / ALG1 0.670 0.0999 Inf -2.688 0.0362
## NB / ALG2 0.528 0.0754 Inf -4.472 <.0001
## S / ALG1 0.589 0.0914 Inf -3.408 0.0037
## S / ALG2 0.465 0.0692 Inf -5.143 <.0001
## ALG1 / ALG2 0.789 0.0997 Inf -1.878 0.2375
##
## P value adjustment: tukey method for comparing a family of 4 estimates
## Tests are performed on the log scale
# emmeans(day_mod, ~treat) %>% pairs() %>% confint() # similar to second table
1/0.465 # ALG2 / S
## [1] 2.150538
ALG2 is 2.15 times more than S
emmeans(day_mod, ~ treat) %>% # get differences
regrid() %>% # does backtransform of the differences BEFORE pair-wise comparisons
pairs %>% # get pair-wise absolute values of differences
confint() # get confidence interval limits for these differences
ALG2 has 13.4 more units of barnacles than NB ALG1 has 7.4 more units of barnacles than NB ALG2 has 15.2 more units of barnacles than S ALG1 has 9.2 more units of barnacles than S
emmeans(day_mod, ~ treat) %>%
regrid() %>%
pairs() %>%
confint()
Define your own
Compare:
Two rules to planned contrasts:
cmat <- cbind('Alg1_Alg2' = c(1, -1, 0, 0),
'NB_S' = c(0, 0, 1, -1),
'Alg_Bare' = c(0.5, 0.5, -0.5, -0.5))
Need to check that they’re all independent
crossprod(cmat) #each column against each other column
## Alg1_Alg2 NB_S Alg_Bare
## Alg1_Alg2 2 0 0
## NB_S 0 2 0
## Alg_Bare 0 0 1
day_mod %>%
emmeans(~treat, contr = list(treat = cmat), type = 'response') #'response' to transform
## $emmeans
## treat rate SE df asymp.LCL asymp.UCL
## NB 15.0 1.73 Inf 12.0 18.8
## S 13.2 1.62 Inf 10.4 16.8
## ALG1 22.4 2.12 Inf 18.6 27.0
## ALG2 28.4 2.38 Inf 24.1 33.5
##
## Confidence level used: 0.95
## Intervals are back-transformed from the log scale
##
## $contrasts
## contrast ratio SE df z.ratio p.value
## treat.Alg1_Alg2 1.136 0.1918 Inf 0.757 0.4488
## treat.NB_S 0.789 0.0997 Inf -1.878 0.0604
## treat.Alg_Bare 0.558 0.0588 Inf -5.536 <.0001
##
## Tests are performed on the log scale
day_mod %>%
emmeans(~treat) %>%
regrid() %>%
contrast(list(TREAT = cmat))
## contrast estimate SE df z.ratio p.value
## TREAT.Alg1_Alg2 1.8 2.37 Inf 0.758 0.4485
## TREAT.NB_S -6.0 3.19 Inf -1.882 0.0598
## TREAT.Alg_Bare -11.3 1.99 Inf -5.686 <.0001
newdata <- emmeans(day_mod, ~treat, type = 'response') %>%
as.data.frame() %>%
rename(barnacle = rate, lwr = asymp.LCL, upr = asymp.UCL)
ggplot(newdata, aes(y = barnacle, x = treat)) +
geom_pointrange(aes(ymin = lwr, ymax = upr)) +
geom_jitter(data = day, col="red", height = 0, width=0.1)
newdata_planned <- day_mod %>%
emmeans(~treat) %>%
regrid() %>%
contrast(list(treat = cmat)) %>%
confint() %>%
rename(lwr = asymp.LCL, upr = asymp.UCL)
ggplot(data = newdata_planned, aes(y = estimate, x = contrast)) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_pointrange(aes(ymin = lwr, ymax = upr)) +
theme_classic() +
coord_flip()
Ways of doing multiple plots with inset ‘a’ and ’b’s:
#
# (g1 + ggtitle('a)')) + (g2 + ggtitle('b)'))
#OR
# g1 + annotate(geom = 'text', y = Inf, x = -Inf, label = 'a)',
# hjust = -0.5, vjust = 1)